# Loading modules
import pandas as pd
from plotly import __version__
import plotly.tools as tls
import plotly.plotly as py
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go
# Initialise plotly's offline notebook mode (imported from plotly.offline
# above) before any of the `iplot` calls further down are executed.
init_notebook_mode()
# Importing data
# Repeat-landscape table (space-separated; extra spaces after the delimiter
# are skipped).
fams_repland = pd.read_csv(
    "../ipython_data/fams_teleost.RepeatLandscapes.fixed.csv",
    skipinitialspace=True,
    sep=" ",
    index_col=False,
)

# Species annotation: the file has no header row, so column names are given
# explicitly.
annot = pd.read_csv(
    "../ipython_data/names.tab",
    encoding="UTF-8",
    sep="\t",
    names=["ALIAS", "ORDER", "SPECIES", "COMMON_NAME"],
)

# No `on=` is given, so pandas joins on the column names the two frames share
# (presumably ALIAS -- confirm against the CSV header).
rl = pd.merge(annot, fams_repland)

# The de novo dataframe is exactly as the total data frame in structure,
# but without the repeats detected only in RepBase.
denovo = pd.read_csv(
    "../ipython_data/denovo_only.dataframe",
    sep="\t",
    index_col=False,
    na_values='na',
    names=["ALIAS", "NAME", "CLASS", "FAM", "CLASS_FAM", "FULL",
           "LENGTH", "FRG", "FULL_FRG", "NR_FRG", "AVG_DIV",
           "MED_DIV", "AVG_DEL", "MED_DEL", "AVG_INS", "MED_INS",
           "LEN_MASKED", "AVG_LEN_MASKED", "MED_LEN_MASKED", "GENOME_PERC",
           "LEN_OVERLAP", "GENOME_PERC_OVERLAP", "LEN_MASKED_OVERLAP"],
)
denovo = pd.merge(denovo, annot)

# Changing wrongly spelled name of Haddock.
# A single .loc assignment is used instead of chained indexing
# (`denovo['COMMON_NAME'][mask] = ...`), which may write to a temporary copy
# and is a no-op under pandas copy-on-write.
denovo.loc[denovo['COMMON_NAME'] == 'Haddoc', 'COMMON_NAME'] = 'Haddock'

# Aliases of the species subset to plot; single unnamed column in the file.
alias74 = pd.read_table('../ipython_data/74aliases', names=['ALIAS'])
# Data manipulation steps for making CDA graphs
# One row per (species, repeat family): sum the remaining columns of `rl`
# within each group, then turn the group keys back into ordinary columns.
fam_plot = (
    rl.groupby(['ALIAS', 'SPECIES', 'Rfam', 'ORDER', 'COMMON_NAME'])
    .sum()
    .reset_index()
)

# Inner merge with the alias list keeps only the species listed in alias74
# and puts ALIAS first in the column order.
fam_plot = pd.merge(alias74, fam_plot)

# Rename every column after the first five to '1' .. '50'.
# The columns are reassigned as a whole rather than by mutating
# `fam_plot.columns.values[5:]` in place: writing into the Index's backing
# array bypasses pandas and can leave label lookups out of sync with the
# displayed names.  As before, a ValueError is raised if there are not
# exactly 50 columns after the first five.
fam_plot.columns = list(fam_plot.columns[:5]) + [str(n) for n in range(1, 51)]
# I am interested in the activities of the top five TE families/superfamilies in each fish genome.
# Total GENOME_PERC per (SPECIES, FAM).
# The column is selected *before* aggregating so only GENOME_PERC is summed;
# summing every column first and then discarding all but one is wasted work
# and trips over the non-numeric columns on pandas versions that no longer
# drop them silently.
tt_frame = (
    denovo.groupby(['SPECIES', 'FAM'])[['GENOME_PERC']]
    .sum()
    .reset_index()
)
# FAM labels that are skipped when ranking families.  These are the values the
# original filter dropped (they look like class-level or non-TE categories
# rather than real family assignments -- confirm with the annotation source).
_NON_FAMILY_LABELS = (
    'DNA',
    'Simple_repeat',
    'Unknown',
    'Low_complexity',
    'LTR',
    'SINE',
    'LINE',
)


def get_top_fams(species, number, frame=None, exclude=_NON_FAMILY_LABELS):
    """Return the `number` TE families with the highest GENOME_PERC for `species`.

    Parameters
    ----------
    species : value matched against the SPECIES column.
    number : maximum number of rows to return (the notebook uses 5).
    frame : DataFrame with SPECIES, FAM and GENOME_PERC columns.  Defaults to
        the module-level `tt_frame`, which is what the function always used
        before this parameter existed.
    exclude : iterable of FAM labels to leave out of the ranking.  Defaults to
        the seven labels that were previously hard-coded.

    Returns
    -------
    DataFrame
        The matching rows of `frame`, sorted by GENOME_PERC in descending
        order and truncated to `number` rows.  Empty if the species is absent.
    """
    if frame is None:
        frame = tt_frame
    wanted = (frame['SPECIES'] == species) & ~frame['FAM'].isin(list(exclude))
    ranked = frame[wanted].sort_values(by='GENOME_PERC', ascending=False)
    return ranked.head(n=number)
# This is the function for making CDA graphs
# Bar colour for each TE (super)family.
# * Hex codes carry the leading '#' that CSS colour syntax requires.
# * 'Turqoise' was a misspelling of the CSS colour name 'Turquoise'.
# * The original literal listed 'Gypsy' twice ('68402B', then 'Green'); Python
#   keeps the last value, so 'Green' is what was actually in effect and is
#   kept here.  NOTE(review): confirm which colour was intended.
_FAMILY_COLORS = {
    'hAT-Ac': '#AEA01D',
    'DIRS': '#2696D8',
    'L2': '#FDF7B5',
    'TcMar-Tc1': '#F93B67',
    'Rex-Babar': '#8C5D58',
    'PIF-Harbinger': '#22428B',
    'RTE-BovB': '#E2D3C9',
    'hAT-Charlie': '#A1AFC9',
    'ERV1': '#D67336',
    'R2-Hero': '#E63EAB',
    'hAT-Tip100': '#96DEAF',
    'Kolobok-T2': '#74578E',
    'Pao': '#818092',
    'Penelope': '#3A2E30',
    'PIF-ISL2EU': '#95CEF4',
    'RTE-X': '#A8DACD',
    'TcMar-Tc2': '#525D29',
    'TcMar-Mariner': 'Black',
    'Jockey': 'Blue',
    'L1': 'Purple',
    'Maverick': 'Pink',
    'L1-Tx1': 'Turquoise',
    'Copia': 'Gray',
    'Gypsy': 'Green',
    'Gypsy-Cigr': 'Black',
    'PiggyBac': 'White',
    'CR1': 'Gold',
    'Dong-R4': 'Silver',
    'hAT': 'Orange',
    'TcMar-Tigger': 'Red',
    'Academ': 'Beige',
    'Crypton': 'Blue',
}


def _family_traces(df, fam):
    """Build one bar trace per family in `fam` that occurs in df['Rfam']."""
    present = set(df['Rfam'])
    traces = []
    for name in fam:
        # Families missing from this species are skipped.  The row is looked
        # up per family, so an absent family can no longer raise IndexError
        # while the lookup for a *present* one is being built.
        if name not in present:
            continue
        row = df[df['Rfam'] == name].iloc[0]
        traces.append(go.Bar(
            # NOTE(review): both x and y start at position 1, i.e. they are
            # sliced identically to the original and cover every column after
            # the first one, not only the divergence bins -- confirm that is
            # intended.
            x=df.columns[1:],
            y=row.iloc[1:1000],
            name=name,
            # .get() -> None lets plotly pick its default colour for a family
            # that has no entry, instead of aborting with KeyError.
            marker=dict(color=_FAMILY_COLORS.get(name)),
        ))
    return traces


def repeat_landscape(df, fam):
    """Build the stacked-bar repeat-landscape figure for one species.

    Parameters
    ----------
    df : DataFrame holding the rows of a single species, with at least the
        columns SPECIES, COMMON_NAME and Rfam.  The first row's SPECIES and
        COMMON_NAME provide the title.
    fam : iterable of family names (values of the Rfam column) to plot, one
        stacked bar trace each; names not present in `df` are skipped.

    Returns
    -------
    plotly.graph_objs.Figure

    Raises
    ------
    IndexError
        If `df` is empty (there is no species to take the title from).
    """
    # The title is taken from `df` itself.  It used to be looked up through a
    # module-level loop variable `i`, which only worked when the function was
    # called from inside that particular loop.
    species = df['SPECIES'].unique()[0]
    common_name = df['COMMON_NAME'].unique()[0]

    stacklayout = go.Layout(
        barmode='stack',
        # Axis title left disabled: 'Divergence from consensus (%)'
        xaxis=dict(
            autotick=False,
            zeroline=True,
            showline=True,
            showticklabels=True,
            mirror='ticks',
            zerolinewidth=2,
            linewidth=2,
            dtick=5,
        ),
        # Axis title left disabled: 'Percentage of genome (%)'
        yaxis=dict(
            autotick=True,
            autorange=True,
            showgrid=False,
            zeroline=True,
            showline=True,
            mirror='ticks',
            # An axis range is a [min, max] pair; the former six-element list
            # was not a valid range.  autorange=True still takes precedence.
            range=[0, 5],
            zerolinewidth=2,
            linewidth=2,
        ),
        height=300,
        width=700,
        plot_bgcolor='White',
        paper_bgcolor='White',
        title='%s (%s)' % (species, common_name),
        showlegend=True,
        legend=dict(font=dict(size=12), traceorder='normal'),
        font=dict(size=12),
    )
    return go.Figure(data=_family_traces(df, fam), layout=stacklayout)
# Generating CDA graphs for each fish: show each figure inline and export it
# as a PDF named after the species' ALIAS.
# NOTE(review): keep this loop variable named `i` unless repeat_landscape is
# confirmed not to read a module-level `i`.
# NOTE(review): make_a_cool_figure loads '../figures/RLfigures/<name>.png',
# whereas this loop writes '.pdf' files -- presumably the PNGs are produced
# separately; confirm.
for i in set(fam_plot.SPECIES):
    # Filter once and build the figure once; both were previously recomputed
    # for the display call and again for the export call.
    species_rows = fam_plot[fam_plot['SPECIES'] == i]
    figure = repeat_landscape(species_rows, get_top_fams(i, 5).FAM.tolist())
    iplot(figure)
    py.image.save_as(
        figure,
        '../figures/RLfigures/%s.pdf' % species_rows.ALIAS.unique()[0],
        format='pdf',
        scale=4,
    )
from ete3 import Tree, faces, TreeStyle, TextFace, NodeStyle
def make_a_cool_figure(tree, order, filename):
    """Render a phylogeny with a repeat-landscape image next to each fish leaf.

    Parameters
    ----------
    tree : passed straight to ``ete3.Tree`` (the notebook passes a path to a
        tree file).
    order : currently unused.  An earlier version pruned every leaf whose
        ALIAS was not in the requested ORDER of ``fam_plot`` (or not in a
        hand-picked list of Merluccius species) with
        ``node.delete(preserve_branch_length=True)``; that filter is disabled.
        The parameter is kept so existing calls continue to work.
    filename : output path handed to ``Tree.render``.

    Returns
    -------
    str
        Always the string "Done".
    """
    # Importing full phylogeny
    t = Tree(tree)

    # Deleting missing data that I do not have a RepeatLandscape for.
    # Guarded so that a tree which does not contain these nodes is rendered
    # instead of failing with IndexError.
    for missing in ("fish_94", "fish_107"):
        matches = t.search_nodes(name=missing)
        if matches:
            matches[0].delete()

    # Names of all remaining nodes that contain 'fish'; a set gives O(1)
    # membership tests in the layout callback.
    fish = {node.name for node in t.traverse() if 'fish' in node.name}

    # Create the layout
    def mylayout(node):
        # Only leaves get an image.  For a leaf, searching its (empty) subtree
        # for every known name is the same as checking its own name, so the
        # per-leaf scan over all names is replaced with a set lookup.
        if node.is_leaf() and node.name in fish:
            # Add the repeatlandscape
            node.img_style["size"] = 1
            faces.add_face_to_node(
                faces.ImgFace('../figures/RLfigures/%s.png' % node.name),
                node, column=0, aligned=True)
            # (A species-name TextFace next to the image is disabled.)

    # Display the tree
    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.show_branch_length = False
    ts.show_scale = False
    ts.layout_fn = mylayout
    ts.tree_width = 1000

    # Small black marker on every node.
    for n in t.traverse():
        nstyle = NodeStyle()
        nstyle["fgcolor"] = "Black"
        nstyle["size"] = 2
        n.set_style(nstyle)

    t.render(filename, tree_style=ts)
    return "Done"
# Render the complete phylogeny (empty `order` argument) into the OrderScapes
# figure directory.
make_a_cool_figure(
    '../phylogeny/final_tree.tre',
    '',
    '../figures/OrderScapes/%s_tree.pdf' % '3.may_test_RL_tree',
)